{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "dc5cd036-af55-4459-b929-d6417f7ab1a8",
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "from pathlib import Path"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "a3a4a277-ea85-4817-a58a-5d67b9accc42",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[DEST] kept 1,650 / 6,005 rows (dest ∈ Seoul).\n",
      "[OK] wrote E:\\불평등 연구\\데이터\\59_출퇴근불평등\\DEST_PANEL_H2H3_2020_2025_DESTinSeoul.parquet\n",
      "[DEST] kept 1,650 / 6,005 rows (dest ∈ Seoul).\n",
      "[OK] wrote E:\\불평등 연구\\데이터\\59_출퇴근불평등\\DEST_PANEL_H2H3_2020_2025_DESTinSeoul.csv\n",
      "[DEST] kept 1,650 / 6,005 rows (dest ∈ Seoul).\n",
      "[OK] wrote E:\\불평등 연구\\데이터\\59_출퇴근불평등\\COMMUTE_PRICE_PANELS_DESTinSeoul.xlsx (engine=openpyxl)\n"
     ]
    }
   ],
   "source": [
    "BASE = Path(r\"E:\\불평등 연구\\데이터\\59_출퇴근불평등\")\n",
    "\n",
    "# Your panels (the script will process whichever exist)\n",
    "ROUTE_CANDS = [\n",
    "    BASE / \"ROUTE_PANEL_H1_2020_2026.parquet\",\n",
    "    BASE / \"ROUTE_PANEL_H1_2020_2026.csv\",\n",
    "    BASE / \"OD_월별_HW_WH_TOTAL_2020_2025_streamed.csv\",\n",
    "]\n",
    "DEST_CANDS = [\n",
    "    BASE / \"DEST_PANEL_H2H3_2020_2026.parquet\",\n",
    "    BASE / \"DEST_PANEL_H2H3_2020_2026.csv\",\n",
    "    BASE / \"DEST_PANEL_H2H3_2020_2025.parquet\",\n",
    "    BASE / \"DEST_PANEL_H2H3_2020_2025.csv\",\n",
    "    BASE / \"COMMUTE_PRICE_PANELS.xlsx\",   # sheet name expected: DEST_H2H3\n",
    "]\n",
    "\n",
    "# The 25 Seoul destination codes (as strings)\n",
    "SEOUL_DEST_CODES = {\n",
    "    \"11010\",\"11020\",\"11030\",\"11040\",\"11050\",\n",
    "    \"11060\",\"11070\",\"11080\",\"11090\",\"11100\",\n",
    "    \"11110\",\"11120\",\"11130\",\"11140\",\"11150\",\n",
    "    \"11160\",\"11170\",\"11180\",\"11190\",\"11200\",\n",
    "    \"11210\",\"11220\",\"11230\",\"11240\",\"11250\",\n",
    "}\n",
    "\n",
    "# =========================\n",
    "# 1) Helpers\n",
    "# =========================\n",
    "def _normalize_codes(s: pd.Series) -> pd.Series:\n",
    "    \"\"\"Normalize codes to comparable 5-char strings.\"\"\"\n",
    "    return (s.astype(str)\n",
    "             .str.replace(r\"\\.0$\", \"\", regex=True)\n",
    "             .str.replace(r\"\\D\", \"\", regex=True)\n",
    "             .str.zfill(5))\n",
    "\n",
    "def filter_dest_in_seoul(df: pd.DataFrame,\n",
    "                         dest_col_candidates=(\"도착시군구코드\",\"도착 시군구 코드\",\"dest\",\"district\")) -> pd.DataFrame:\n",
    "    df = df.copy()\n",
    "    dest_col = next((c for c in dest_col_candidates if c in df.columns), None)\n",
    "    if dest_col is None:\n",
    "        raise KeyError(f\"Destination code column not found. Tried: {dest_col_candidates}\")\n",
    "    df[dest_col] = _normalize_codes(df[dest_col])\n",
    "    out = df[df[dest_col].isin(SEOUL_DEST_CODES)].copy()\n",
    "    return out\n",
    "\n",
    "def read_any(path: Path, sheet=None) -> pd.DataFrame:\n",
    "    ext = path.suffix.lower()\n",
    "    if ext == \".parquet\":\n",
    "        return pd.read_parquet(path)\n",
    "    if ext == \".csv\":\n",
    "        return pd.read_csv(path, encoding=\"utf-8-sig\")\n",
    "    if ext in (\".xlsx\", \".xls\"):\n",
    "        try:\n",
    "            return pd.read_excel(path, sheet_name=sheet) if sheet else pd.read_excel(path)\n",
    "        except Exception:\n",
    "            return pd.read_excel(path)\n",
    "    raise ValueError(f\"Unsupported file type: {path}\")\n",
    "\n",
    "def write_like_input(df: pd.DataFrame, in_path: Path, suffix: str, sheet_name: str | None = None) -> Path:\n",
    "    \"\"\"Save using the same extension as input. For Excel, try openpyxl then xlsxwriter. \n",
    "       If both unavailable, fall back to CSV.\"\"\"\n",
    "    out = in_path.with_name(in_path.stem + suffix + in_path.suffix)\n",
    "    ext = out.suffix.lower()\n",
    "\n",
    "    if ext == \".parquet\":\n",
    "        df.to_parquet(out, index=False)\n",
    "        print(f\"[OK] wrote {out}\")\n",
    "        return out\n",
    "\n",
    "    if ext == \".csv\":\n",
    "        df.to_csv(out, index=False, encoding=\"utf-8-sig\")\n",
    "        print(f\"[OK] wrote {out}\")\n",
    "        return out\n",
    "\n",
    "    if ext in (\".xlsx\", \".xls\"):\n",
    "        # Try openpyxl first, then xlsxwriter\n",
    "        last_err = None\n",
    "        for eng in (\"openpyxl\", \"xlsxwriter\"):\n",
    "            try:\n",
    "                with pd.ExcelWriter(out, engine=eng) as xw:\n",
    "                    df.to_excel(xw, index=False, sheet_name=sheet_name or \"Sheet1\")\n",
    "                print(f\"[OK] wrote {out} (engine={eng})\")\n",
    "                return out\n",
    "            except Exception as e:\n",
    "                last_err = e\n",
    "        # Fallback to CSV if Excel engines unavailable\n",
    "        out_csv = out.with_suffix(\".csv\")\n",
    "        df.to_csv(out_csv, index=False, encoding=\"utf-8-sig\")\n",
    "        print(f\"[WARN] Could not write Excel ({last_err}). Wrote CSV instead: {out_csv}\")\n",
    "        return out_csv\n",
    "\n",
    "    # Default fallback\n",
    "    out_csv = out.with_suffix(\".csv\")\n",
    "    df.to_csv(out_csv, index=False, encoding=\"utf-8-sig\")\n",
    "    print(f\"[OK] wrote {out_csv}\")\n",
    "    return out_csv\n",
    "\n",
    "# =========================\n",
    "# 2) Run filtering for all files found\n",
    "# =========================\n",
    "for p in ROUTE_CANDS:\n",
    "    if p.exists():\n",
    "        df = read_any(p)\n",
    "        df_f = filter_dest_in_seoul(df)\n",
    "        print(f\"[ROUTE] kept {len(df_f):,} / {len(df):,} rows (dest ∈ Seoul).\")\n",
    "        write_like_input(df_f, p, \"_DESTinSeoul\")\n",
    "\n",
    "for p in DEST_CANDS:\n",
    "    if p.exists():\n",
    "        sheet = \"DEST_H2H3\" if p.suffix.lower() in (\".xlsx\",\".xls\") else None\n",
    "        df = read_any(p, sheet=sheet)\n",
    "        df_f = filter_dest_in_seoul(df)\n",
    "        print(f\"[DEST] kept {len(df_f):,} / {len(df):,} rows (dest ∈ Seoul).\")\n",
    "        write_like_input(df_f, p, \"_DESTinSeoul\", sheet_name=sheet or \"DEST_H2H3\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "70c1df52-4b71-4d56-9ab7-165089215f4d",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "dad525b2-d0e6-4e16-9d4c-3cb9f90fa897",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[H1/ROUTE] kept 158,070 / 271,902 rows (dest ∈ Seoul) from ROUTE_PANEL_H1_2020_2025.parquet.\n",
      "[OK] wrote E:\\불평등 연구\\데이터\\59_출퇴근불평등\\ROUTE_PANEL_H1_2020_2025_DESTinSeoul.parquet\n",
      "[H1/ROUTE] kept 158,070 / 271,902 rows (dest ∈ Seoul) from ROUTE_PANEL_H1_2020_2025.csv.\n",
      "[OK] wrote E:\\불평등 연구\\데이터\\59_출퇴근불평등\\ROUTE_PANEL_H1_2020_2025_DESTinSeoul.csv\n",
      "[H2/H3/DEST] kept 1,650 / 6,005 rows (dest ∈ Seoul) from DEST_PANEL_H2H3_2020_2025.parquet.\n",
      "[OK] wrote E:\\불평등 연구\\데이터\\59_출퇴근불평등\\DEST_PANEL_H2H3_2020_2025_DESTinSeoul.parquet\n",
      "[H2/H3/DEST] kept 1,650 / 6,005 rows (dest ∈ Seoul) from DEST_PANEL_H2H3_2020_2025.csv.\n",
      "[OK] wrote E:\\불평등 연구\\데이터\\59_출퇴근불평등\\DEST_PANEL_H2H3_2020_2025_DESTinSeoul.csv\n",
      "[H2/H3/DEST] kept 1,650 / 6,005 rows (dest ∈ Seoul) from COMMUTE_PRICE_PANELS.xlsx.\n",
      "[OK] wrote E:\\불평등 연구\\데이터\\59_출퇴근불평등\\COMMUTE_PRICE_PANELS_DESTinSeoul.xlsx (engine=openpyxl)\n"
     ]
    }
   ],
   "source": [
    "BASE = Path(r\"E:\\불평등 연구\\데이터\\59_출퇴근불평등\")\n",
    "\n",
    "# H1 (route/OD) panels — the script will process whichever exist\n",
    "ROUTE_CANDS = [\n",
    "    BASE / \"ROUTE_PANEL_H1_2020_2026.parquet\",\n",
    "    BASE / \"ROUTE_PANEL_H1_2020_2026.csv\",\n",
    "    BASE / \"ROUTE_PANEL_H1_2020_2026.xlsx\",\n",
    "    BASE / \"ROUTE_PANEL_H1_2020_2025.parquet\",\n",
    "    BASE / \"ROUTE_PANEL_H1_2020_2025.csv\",\n",
    "    BASE / \"ROUTE_PANEL_H1_2020_2025.xlsx\",\n",
    "    BASE / \"OD_월별_HW_WH_TOTAL_2020_2025_streamed.csv\",  # if you still keep this\n",
    "]\n",
    "\n",
    "# H2/H3 (destination) panels\n",
    "DEST_CANDS = [\n",
    "    BASE / \"DEST_PANEL_H2H3_2020_2026.parquet\",\n",
    "    BASE / \"DEST_PANEL_H2H3_2020_2026.csv\",\n",
    "    BASE / \"DEST_PANEL_H2H3_2020_2026.xlsx\",\n",
    "    BASE / \"DEST_PANEL_H2H3_2020_2025.parquet\",\n",
    "    BASE / \"DEST_PANEL_H2H3_2020_2025.csv\",\n",
    "    BASE / \"DEST_PANEL_H2H3_2020_2025.xlsx\",\n",
    "    BASE / \"COMMUTE_PRICE_PANELS.xlsx\",   # sheet: DEST_H2H3 (if used)\n",
    "]\n",
    "\n",
    "# The 25 Seoul destination codes (as strings)\n",
    "SEOUL_DEST_CODES = {\n",
    "    \"11010\",\"11020\",\"11030\",\"11040\",\"11050\",\n",
    "    \"11060\",\"11070\",\"11080\",\"11090\",\"11100\",\n",
    "    \"11110\",\"11120\",\"11130\",\"11140\",\"11150\",\n",
    "    \"11160\",\"11170\",\"11180\",\"11190\",\"11200\",\n",
    "    \"11210\",\"11220\",\"11230\",\"11240\",\"11250\",\n",
    "}\n",
    "\n",
    "# =========================\n",
    "# Helpers\n",
    "# =========================\n",
    "def _normalize_codes(s: pd.Series) -> pd.Series:\n",
    "    \"\"\"Normalize codes to comparable 5-char strings.\"\"\"\n",
    "    return (s.astype(str)\n",
    "             .str.replace(r\"\\.0$\", \"\", regex=True)\n",
    "             .str.replace(r\"\\D\", \"\", regex=True)\n",
    "             .str.zfill(5))\n",
    "\n",
    "def filter_dest_in_seoul(df: pd.DataFrame,\n",
    "                         dest_col_candidates=(\"도착시군구코드\",\"도착 시군구 코드\",\"dest\",\"district\")) -> pd.DataFrame:\n",
    "    df = df.copy()\n",
    "    dest_col = next((c for c in dest_col_candidates if c in df.columns), None)\n",
    "    if dest_col is None:\n",
    "        raise KeyError(f\"Destination code column not found. Tried: {dest_col_candidates}\")\n",
    "    df[dest_col] = _normalize_codes(df[dest_col])\n",
    "    out = df[df[dest_col].isin(SEOUL_DEST_CODES)].copy()\n",
    "    return out\n",
    "\n",
    "def read_any(path: Path, sheet=None) -> pd.DataFrame:\n",
    "    ext = path.suffix.lower()\n",
    "    if ext == \".parquet\":\n",
    "        return pd.read_parquet(path)\n",
    "    if ext == \".csv\":\n",
    "        return pd.read_csv(path, encoding=\"utf-8-sig\")\n",
    "    if ext in (\".xlsx\", \".xls\"):\n",
    "        try:\n",
    "            return pd.read_excel(path, sheet_name=sheet) if sheet else pd.read_excel(path)\n",
    "        except Exception:\n",
    "            return pd.read_excel(path)\n",
    "    raise ValueError(f\"Unsupported file type: {path}\")\n",
    "\n",
    "def write_like_input(df: pd.DataFrame, in_path: Path, suffix: str, sheet_name: str | None = None) -> Path:\n",
    "    \"\"\"Save using the same extension as input. For Excel, try openpyxl then xlsxwriter.\n",
    "       If both unavailable, fall back to CSV.\"\"\"\n",
    "    out = in_path.with_name(in_path.stem + suffix + in_path.suffix)\n",
    "    ext = out.suffix.lower()\n",
    "\n",
    "    if ext == \".parquet\":\n",
    "        df.to_parquet(out, index=False)\n",
    "        print(f\"[OK] wrote {out}\")\n",
    "        return out\n",
    "\n",
    "    if ext == \".csv\":\n",
    "        df.to_csv(out, index=False, encoding=\"utf-8-sig\")\n",
    "        print(f\"[OK] wrote {out}\")\n",
    "        return out\n",
    "\n",
    "    if ext in (\".xlsx\", \".xls\"):\n",
    "        last_err = None\n",
    "        for eng in (\"openpyxl\", \"xlsxwriter\"):\n",
    "            try:\n",
    "                with pd.ExcelWriter(out, engine=eng) as xw:\n",
    "                    df.to_excel(xw, index=False, sheet_name=sheet_name or \"Sheet1\")\n",
    "                print(f\"[OK] wrote {out} (engine={eng})\")\n",
    "                return out\n",
    "            except Exception as e:\n",
    "                last_err = e\n",
    "        # Fallback to CSV if Excel engines unavailable\n",
    "        out_csv = out.with_suffix(\".csv\")\n",
    "        df.to_csv(out_csv, index=False, encoding=\"utf-8-sig\")\n",
    "        print(f\"[WARN] Could not write Excel ({last_err}). Wrote CSV instead: {out_csv}\")\n",
    "        return out_csv\n",
    "\n",
    "    # Default fallback\n",
    "    out_csv = out.with_suffix(\".csv\")\n",
    "    df.to_csv(out_csv, index=False, encoding=\"utf-8-sig\")\n",
    "    print(f\"[OK] wrote {out_csv}\")\n",
    "    return out_csv\n",
    "\n",
    "# =========================\n",
    "# 1) Filter H1 (route) files\n",
    "# =========================\n",
    "for p in ROUTE_CANDS:\n",
    "    if p.exists():\n",
    "        df = read_any(p)\n",
    "        df_f = filter_dest_in_seoul(df)\n",
    "        print(f\"[H1/ROUTE] kept {len(df_f):,} / {len(df):,} rows (dest ∈ Seoul) from {p.name}.\")\n",
    "        write_like_input(df_f, p, \"_DESTinSeoul\")\n",
    "\n",
    "# =========================\n",
    "# 2) Filter H2/H3 (destination) files\n",
    "# =========================\n",
    "for p in DEST_CANDS:\n",
    "    if p.exists():\n",
    "        sheet = \"DEST_H2H3\" if p.suffix.lower() in (\".xlsx\",\".xls\") and p.name.upper().endswith(\".XLSX\") else None\n",
    "        # If it's a generic workbook (COMMUTE_PRICE_PANELS.xlsx), try the expected sheet\n",
    "        if p.name == \"COMMUTE_PRICE_PANELS.xlsx\":\n",
    "            sheet = \"DEST_H2H3\"\n",
    "        df = read_any(p, sheet=sheet)\n",
    "        df_f = filter_dest_in_seoul(df)\n",
    "        print(f\"[H2/H3/DEST] kept {len(df_f):,} / {len(df):,} rows (dest ∈ Seoul) from {p.name}.\")\n",
    "        write_like_input(df_f, p, \"_DESTinSeoul\", sheet_name=sheet or \"DEST_H2H3\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "380392c5-0f39-4af5-b32f-c52b51b8f75c",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "2c18f20b-51ea-4689-aaa4-8f341b59edbb",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "63a6682d-9561-4e6f-9961-b3fd4a0c7dbc",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "2ab86738-983c-4919-9867-7054a460fd98",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "e9241548-8392-47fc-8c89-118113dc4d52",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "ae0ac742-289f-46bd-842c-737da9250d8c",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.12.8"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
